{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "from utils import *\n",
    "import tensorflow as tf\n",
    "from sklearn.cross_validation import train_test_split\n",
    "import time\n",
    "import random\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['negative', 'positive']\n",
      "10662\n",
      "10662\n"
     ]
    }
   ],
   "source": [
    "# Read the labelled corpus stored under the `data` folder.\n",
    "trainset = sklearn.datasets.load_files(container_path='data', encoding='UTF-8')\n",
    "# separate_dataset is not defined in this notebook (presumably it comes from utils);\n",
    "# its two results replace the raw texts and labels on the bunch.\n",
    "texts, labels = separate_dataset(trainset, 1.0)\n",
    "trainset.data, trainset.target = texts, labels\n",
    "# Sanity check: class names, then the number of texts and the number of labels.\n",
    "for summary in (trainset.target_names, len(trainset.data), len(trainset.target)):\n",
    "    print(summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fixed seed so the train/test partition is the same on every run.\n",
    "SPLIT_SEED = 42\n",
    "# One-hot labels: row i holds 1.0 in column trainset.target[i] and 0.0 elsewhere.\n",
    "ONEHOT = np.zeros((len(trainset.data), len(trainset.target_names)))\n",
    "ONEHOT[np.arange(len(trainset.data)), trainset.target] = 1.0\n",
    "# Split texts, integer labels and one-hot labels in one call so the three stay aligned.\n",
    "train_X, test_X, train_Y, test_Y, train_onehot, test_onehot = train_test_split(\n",
    "    trainset.data, trainset.target, ONEHOT, test_size = 0.2, random_state = SPLIT_SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "vocab from size: 20332\n",
      "Most common words [('film', 1453), ('movie', 1270), ('one', 727), ('like', 721), ('story', 477), ('much', 386)]\n",
      "Sample data [544, 2559, 3142, 10271, 36, 7703, 217, 151, 19, 3801] ['rock', 'destined', '21st', 'centurys', 'new', 'conan', 'hes', 'going', 'make', 'splash']\n"
     ]
    }
   ],
   "source": [
    "# Flatten the whole corpus into a single list of whitespace-separated tokens.\n",
    "concat = ' '.join(trainset.data).split()\n",
    "# Number of distinct tokens.\n",
    "vocabulary_size = len(set(concat))\n",
    "data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)\n",
    "print('vocab from size: %d' % vocabulary_size)\n",
    "# The first four entries of `count` are skipped (presumably the reserved GO/PAD/EOS/UNK tokens).\n",
    "print('Most common words', count[4:10])\n",
    "sample_ids = data[:10]\n",
    "sample_words = [rev_dictionary[token_id] for token_id in sample_ids]\n",
    "print('Sample data', sample_ids, sample_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ids of the four reserved tokens, looked up in the vocabulary mapping.\n",
    "GO, PAD, EOS, UNK = (dictionary[token] for token in ('GO', 'PAD', 'EOS', 'UNK'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensorflow.python.ops.rnn_cell import RNNCell\n",
    "from tensorflow.contrib.rnn.python.ops import core_rnn_cell\n",
    "\n",
    "def linear(args, output_size, bias, bias_start=0.0, scope=None):\n",
    "    \"\"\"Apply one shared affine map to the column-wise concatenation of `args`.\n",
    "\n",
    "    Args:\n",
    "        args: a 2-D tensor, or a non-empty list/tuple of 2-D tensors whose\n",
    "            second dimension is statically known.\n",
    "        output_size: number of output columns.\n",
    "        bias: when truthy, a learned bias vector is added to the result.\n",
    "        bias_start: constant used to initialise the bias.\n",
    "        scope: variable scope name; \"Linear\" is used when falsy.\n",
    "\n",
    "    Returns:\n",
    "        The 2-D tensor concat(args, axis=1) x Matrix, plus Bias when requested.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if `args` is None or empty, if an argument is not 2-D, or\n",
    "            if its second dimension is unknown or zero.\n",
    "    \"\"\"\n",
    "    if args is None or (isinstance(args, (list, tuple)) and not args):\n",
    "        raise ValueError(\"`args` must be specified\")\n",
    "    if not isinstance(args, (list, tuple)):\n",
    "        args = [args]\n",
    "\n",
    "    # Sum the static widths (dimension 1) of all arguments.\n",
    "    shapes = [a.get_shape().as_list() for a in args]\n",
    "    total_arg_size = 0\n",
    "    for shape in shapes:\n",
    "        if len(shape) != 2:\n",
    "            raise ValueError(\n",
    "                \"Linear is expecting 2D arguments: %s\" % str(shapes))\n",
    "        if not shape[1]:\n",
    "            raise ValueError(\n",
    "                \"Linear expects shape[1] of arguments: %s\" % str(shapes))\n",
    "        total_arg_size += shape[1]\n",
    "\n",
    "    with tf.variable_scope(scope or \"Linear\"):\n",
    "        matrix = tf.get_variable(\"Matrix\", [total_arg_size, output_size])\n",
    "        if len(args) == 1:\n",
    "            res = tf.matmul(args[0], matrix)\n",
    "        else:\n",
    "            # TF >= 1.0 takes the tensors first and the axis second; the old\n",
    "            # (axis, values) order breaks as soon as several args are passed.\n",
    "            res = tf.matmul(tf.concat(args, axis=1), matrix)\n",
    "        if not bias:\n",
    "            return res\n",
    "        bias_term = tf.get_variable(\n",
    "            \"Bias\", [output_size],\n",
    "            initializer=tf.constant_initializer(bias_start))\n",
    "    return res + bias_term\n",
    "\n",
    "class SRUCell(RNNCell):\n",
    "    \"\"\"Recurrent cell whose two gates are computed from the current input only.\n",
    "\n",
    "    The previous state enters solely through the element-wise update of the\n",
    "    new state; the output mixes the activated state with the raw input.\n",
    "\n",
    "    Args:\n",
    "        num_units: size of both the state and the output.\n",
    "        activation: function applied to the new state; `tf.tanh` when falsy.\n",
    "        reuse: accepted but not used by this class.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, num_units, activation=None, reuse=None):\n",
    "        self._activation = tf.tanh if not activation else activation\n",
    "        self._num_units = num_units\n",
    "\n",
    "    @property\n",
    "    def state_size(self):\n",
    "        \"\"\"Size of the recurrent state.\"\"\"\n",
    "        return self._num_units\n",
    "\n",
    "    @property\n",
    "    def output_size(self):\n",
    "        \"\"\"Size of the emitted output.\"\"\"\n",
    "        return self._num_units\n",
    "\n",
    "    def __call__(self, inputs, state, scope='SRUCell'):\n",
    "        \"\"\"Run one step and return `(output, new_state)`.\n",
    "\n",
    "        `inputs` is added to the output element-wise, so it has to be\n",
    "        broadcast-compatible with a `[batch, num_units]` tensor.\n",
    "        \"\"\"\n",
    "        units = self._num_units\n",
    "        with tf.variable_scope(scope):\n",
    "            with tf.variable_scope(\"Inputs\"):\n",
    "                candidate = linear([inputs], units, False)\n",
    "            with tf.variable_scope(\"Gate\"):\n",
    "                # One projection yields both gates; split it in half along axis 1.\n",
    "                gates = tf.sigmoid(linear([inputs], 2 * units, True))\n",
    "                forget_gate, reset_gate = tf.split(gates, 2, axis=1)\n",
    "\n",
    "            new_state = forget_gate * state + (1 - forget_gate) * candidate\n",
    "            output = reset_gate * self._activation(new_state) + (1 - reset_gate) * inputs\n",
    "\n",
    "        return output, new_state\n",
    "\n",
    "class Model:\n",
    "    \"\"\"Text classifier: embedding lookup -> stacked SRU cells -> dense layer on the last time step.\n",
    "\n",
    "    Args:\n",
    "        size_layer: number of units in every SRU cell.\n",
    "        num_layers: number of stacked SRU cells.\n",
    "        embedded_size: width of each embedding vector. SRUCell adds its input\n",
    "            to its output element-wise, so this should match `size_layer`.\n",
    "        dict_size: number of rows in the embedding table.\n",
    "        dimension_output: number of classes.\n",
    "\n",
    "    Attributes:\n",
    "        X: int32 placeholder of token ids, [batch, time].\n",
    "        Y: float32 placeholder of target rows, [batch, dimension_output].\n",
    "        logits: unnormalised class scores computed from the last time step.\n",
    "        cost: mean softmax cross-entropy between `logits` and `Y`.\n",
    "        optimizer: Adam update op (learning rate 1e-3) minimising `cost`.\n",
    "        accuracy: fraction of rows whose arg-max prediction matches `Y`.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, size_layer, num_layers, embedded_size,\n",
    "                 dict_size, dimension_output):\n",
    "        self.X = tf.placeholder(tf.int32, [None, None])\n",
    "        self.Y = tf.placeholder(tf.float32, [None, dimension_output])\n",
    "        embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))\n",
    "        embedded = tf.nn.embedding_lookup(embeddings, self.X)\n",
    "        rnn_cells = tf.nn.rnn_cell.MultiRNNCell(\n",
    "            [SRUCell(size_layer, reuse=False) for _ in range(num_layers)])\n",
    "        outputs, _ = tf.nn.dynamic_rnn(rnn_cells, embedded, dtype = tf.float32)\n",
    "        W = tf.get_variable('w',shape=(size_layer, dimension_output),initializer=tf.orthogonal_initializer())\n",
    "        # One bias per class; the trailing comma makes the shape a real 1-tuple.\n",
    "        b = tf.get_variable('b',shape=(dimension_output,),initializer=tf.zeros_initializer())\n",
    "        self.logits = tf.matmul(outputs[:, -1], W) + b\n",
    "        # The _v2 op replaces the deprecated softmax_cross_entropy_with_logits. It can also\n",
    "        # back-propagate into `labels`, which changes nothing here because Y is a placeholder.\n",
    "        self.cost = tf.reduce_mean(\n",
    "            tf.nn.softmax_cross_entropy_with_logits_v2(logits = self.logits, labels = self.Y))\n",
    "        self.optimizer = tf.train.AdamOptimizer(learning_rate = 1e-3).minimize(self.cost)\n",
    "        correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))\n",
    "        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Network size. size_layer and embedded_size are kept equal because SRUCell\n",
    "# adds its input to its output element-wise.\n",
    "num_layers = 2\n",
    "size_layer = embedded_size = 128\n",
    "dimension_output = len(trainset.target_names)\n",
    "# maxlen is forwarded to str_idx; batch_size is the mini-batch size of the training loop.\n",
    "maxlen, batch_size = 50, 128"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From <ipython-input-6-9f2b5c716f38>:80: softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "\n",
      "Future major versions of TensorFlow will allow gradients to flow\n",
      "into the labels input on backprop by default.\n",
      "\n",
      "See @{tf.nn.softmax_cross_entropy_with_logits_v2}.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Start from an empty default graph, then build the model and initialise its variables.\n",
    "tf.reset_default_graph()\n",
    "sess = tf.InteractiveSession()\n",
    "model = Model(\n",
    "    size_layer = size_layer,\n",
    "    num_layers = num_layers,\n",
    "    embedded_size = embedded_size,\n",
    "    dict_size = len(dictionary),\n",
    "    dimension_output = dimension_output,\n",
    ")\n",
    "sess.run(tf.global_variables_initializer())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 0, pass acc: 0.000000, current acc: 0.554199\n",
      "time taken: 3.4405574798583984\n",
      "epoch: 0, training loss: 0.689039, training acc: 0.535748, valid loss: 0.680192, valid acc: 0.554199\n",
      "\n",
      "epoch: 1, pass acc: 0.554199, current acc: 0.583984\n",
      "time taken: 3.066664934158325\n",
      "epoch: 1, training loss: 0.630970, training acc: 0.649858, valid loss: 0.671964, valid acc: 0.583984\n",
      "\n",
      "epoch: 2, pass acc: 0.583984, current acc: 0.626465\n",
      "time taken: 3.135775327682495\n",
      "epoch: 2, training loss: 0.532484, training acc: 0.734730, valid loss: 0.686642, valid acc: 0.626465\n",
      "\n",
      "epoch: 3, pass acc: 0.626465, current acc: 0.646484\n",
      "time taken: 3.11407732963562\n",
      "epoch: 3, training loss: 0.408517, training acc: 0.818892, valid loss: 0.734317, valid acc: 0.646484\n",
      "\n",
      "epoch: 4, pass acc: 0.646484, current acc: 0.653809\n",
      "time taken: 3.2255287170410156\n",
      "epoch: 4, training loss: 0.288211, training acc: 0.884233, valid loss: 0.821830, valid acc: 0.653809\n",
      "\n",
      "time taken: 3.1103758811950684\n",
      "epoch: 5, training loss: 0.185976, training acc: 0.932528, valid loss: 0.953978, valid acc: 0.650391\n",
      "\n",
      "time taken: 3.197439193725586\n",
      "epoch: 6, training loss: 0.107132, training acc: 0.967685, valid loss: 1.143521, valid acc: 0.649414\n",
      "\n",
      "epoch: 7, pass acc: 0.653809, current acc: 0.655273\n",
      "time taken: 3.0919787883758545\n",
      "epoch: 7, training loss: 0.056378, training acc: 0.986387, valid loss: 1.360909, valid acc: 0.655273\n",
      "\n",
      "epoch: 8, pass acc: 0.655273, current acc: 0.659180\n",
      "time taken: 3.200023651123047\n",
      "epoch: 8, training loss: 0.028726, training acc: 0.994437, valid loss: 1.571254, valid acc: 0.659180\n",
      "\n",
      "time taken: 3.148078441619873\n",
      "epoch: 9, training loss: 0.014215, training acc: 0.997869, valid loss: 1.786603, valid acc: 0.658203\n",
      "\n",
      "time taken: 3.2151803970336914\n",
      "epoch: 10, training loss: 0.008183, training acc: 0.999290, valid loss: 1.966834, valid acc: 0.652344\n",
      "\n",
      "time taken: 2.972202777862549\n",
      "epoch: 11, training loss: 0.004848, training acc: 0.999645, valid loss: 2.058176, valid acc: 0.657715\n",
      "\n",
      "time taken: 3.206712007522583\n",
      "epoch: 12, training loss: 0.003433, training acc: 0.999408, valid loss: 2.168201, valid acc: 0.651367\n",
      "\n",
      "time taken: 3.1262435913085938\n",
      "epoch: 13, training loss: 0.002158, training acc: 0.999882, valid loss: 2.243634, valid acc: 0.654297\n",
      "\n",
      "break epoch:14\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Early stopping: quit after EARLY_STOPPING consecutive epochs without a new best validation accuracy.\n",
    "EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0\n",
    "while True:\n",
    "    lasttime = time.time()\n",
    "    if CURRENT_CHECKPOINT == EARLY_STOPPING:\n",
    "        print('break epoch:%d\\n'%(EPOCH))\n",
    "        break\n",
    "\n",
    "    # Only full batches are used; a trailing partial batch is skipped.\n",
    "    train_steps = len(train_X) // batch_size\n",
    "    test_steps = len(test_X) // batch_size\n",
    "    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0\n",
    "\n",
    "    # Training pass: one optimizer step per batch.\n",
    "    for step in range(train_steps):\n",
    "        lo, hi = step * batch_size, (step + 1) * batch_size\n",
    "        batch_x = str_idx(train_X[lo:hi], dictionary, maxlen)\n",
    "        feed = {model.X : batch_x, model.Y : train_onehot[lo:hi]}\n",
    "        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], feed_dict = feed)\n",
    "        train_loss += loss\n",
    "        train_acc += acc\n",
    "\n",
    "    # Validation pass: same metrics, no optimizer step.\n",
    "    for step in range(test_steps):\n",
    "        lo, hi = step * batch_size, (step + 1) * batch_size\n",
    "        batch_x = str_idx(test_X[lo:hi], dictionary, maxlen)\n",
    "        feed = {model.X : batch_x, model.Y : test_onehot[lo:hi]}\n",
    "        acc, loss = sess.run([model.accuracy, model.cost], feed_dict = feed)\n",
    "        test_loss += loss\n",
    "        test_acc += acc\n",
    "\n",
    "    # Turn the per-batch sums into per-epoch means.\n",
    "    train_loss /= train_steps\n",
    "    train_acc /= train_steps\n",
    "    test_loss /= test_steps\n",
    "    test_acc /= test_steps\n",
    "\n",
    "    # A new best accuracy resets the patience counter; anything else uses one unit of it.\n",
    "    if test_acc > CURRENT_ACC:\n",
    "        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))\n",
    "        CURRENT_ACC, CURRENT_CHECKPOINT = test_acc, 0\n",
    "    else:\n",
    "        CURRENT_CHECKPOINT += 1\n",
    "\n",
    "    print('time taken:', time.time()-lasttime)\n",
    "    epoch_summary = (EPOCH, train_loss, train_acc, test_loss, test_acc)\n",
    "    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\\n'%epoch_summary)\n",
    "    EPOCH += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "   negative       0.66      0.65      0.66      1087\n",
      "   positive       0.64      0.65      0.65      1046\n",
      "\n",
      "avg / total       0.65      0.65      0.65      2133\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Score the whole test split in a single forward pass, then report per-class metrics.\n",
    "test_batch = str_idx(test_X, dictionary, maxlen)\n",
    "logits = sess.run(model.logits, feed_dict = {model.X : test_batch})\n",
    "predicted = np.argmax(logits, 1)\n",
    "print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
